# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)
library(ResourceSelection)
library(sf)
# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
## HH1 HH2 LN WM1 WM2 WM3 WMINT WM4 WM5 WM6D WM6M WM6Y WM8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 3 1 2 3 92 91 92 19 11 2020 2
## 2 1 4 2 1 4 2 92 91 92 18 11 2020 1
## 3 1 9 4 1 9 4 92 91 92 18 11 2020 1
## 4 1 10 4 1 10 4 92 91 92 18 11 2020 2
## 5 1 11 4 1 11 4 92 91 92 19 11 2020 2
## 6 1 11 5 1 11 5 92 91 92 18 11 2020 2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## # WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## # WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## # WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## # WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## # WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## # WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)
# View the first few rows of the new dataframe
summary(female_df)
## WAGEM MSTATUS HH6 HH7 welevel
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.00
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.00
## Median :20.00 Median :1.000 Median :2.000 Median :3.000 Median :2.00
## Mean :20.92 Mean :1.413 Mean :1.683 Mean :3.404 Mean :2.46
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:3.00
## Max. :47.00 Max. :9.000 Max. :2.000 Max. :6.000 Max. :9.00
## NA's :2026 NA's :11 NA's :11 NA's :11 NA's :524
## insurance ethnicity windex5 CP2
## Min. :1.000 Min. :1.000 Min. :0.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :2.000 Median :1.000
## Mean :1.135 Mean :2.035 Mean :2.494 Mean :1.431
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :9.000 Max. :6.000 Max. :5.000 Max. :9.000
## NA's :524 NA's :864
## HA1 MT4 MT9 MT11
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :2.000 Median :1.000 Median :1.00
## Mean :1.215 Mean :1.664 Mean :1.365 Mean :1.12
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.00
## NA's :524 NA's :524 NA's :2496 NA's :524
# Rename the columns
female_df <- female_df %>%
rename(
age_first_marriage = WAGEM,
marital_status = MSTATUS,
area = HH6,
region = HH7,
education_level = welevel,
health_insurance = insurance,
ethnicity = ethnicity,
wealth_index = windex5,
current_contraceptive_use = CP2,
awareness_hiv_aids = HA1,
used_computer_tablet = MT4,
used_internet = MT9,
owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
## age_first_marriage marital_status area
## 2026 11 11
## region education_level health_insurance
## 11 524 524
## ethnicity wealth_index current_contraceptive_use
## 0 0 864
## awareness_hiv_aids used_computer_tablet used_internet
## 524 524 2496
## owns_mobile_phone
## 524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
mutate(
current_contraceptive_use = na_if(current_contraceptive_use, 9),
used_internet = na_if(used_internet, 9),
health_insurance = na_if(health_insurance, 9),
education_level = na_if(education_level, 9),
awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
used_computer_tablet = na_if(used_computer_tablet, 9),
owns_mobile_phone = na_if(owns_mobile_phone, 9),
marital_status = na_if(marital_status, 9)
)
# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)
# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.92 Mean :1.408 Mean :1.683 Mean :3.404
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :3.000 Max. :2.000 Max. :6.000
## NA's :2026 NA's :18 NA's :11 NA's :11
## education_level health_insurance ethnicity wealth_index
## Min. :0.00 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.46 Mean :0.8659 Mean :1.586 Mean :2.615
## 3rd Qu.:3.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :1.0000 Max. :4.000 Max. :5.000
## NA's :525 NA's :525 NA's :1148 NA's :524
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5842 Mean :0.7975 Mean :0.3412
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :885 NA's :541 NA's :532
## used_internet owns_mobile_phone
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000
## Mean :0.6394 Mean :0.8831
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :2501 NA's :529
Access to Media# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))
# In later analysis, use access_to_media instead of the 3 separate variables
# Exporting female_df to a CSV file in the current working directory
#write.csv(female_df, "female_df.csv", row.names = FALSE)
# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "#F7C0C8", color = "black") +
theme_minimal() +
ggtitle("Histogram of Age at First Marriage") +
xlab("Age at First Marriage") +
ylab("Frequency")
print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Before that, let's double check the count of missing values by column
count_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(count_missing)
## age_first_marriage marital_status area
## 2026 18 11
## region education_level health_insurance
## 11 525 525
## ethnicity wealth_index current_contraceptive_use
## 1148 524 885
## awareness_hiv_aids used_computer_tablet used_internet
## 541 532 2501
## owns_mobile_phone access_to_media
## 529 529
# Make a copy of female_df for imputation
imputed_df <- female_df
# Handle missing values in 'Age at First Marriage'
# Calculate the median value for 'Age at First Marriage', excluding NA values
median_age_first_marriage <- median(imputed_df$age_first_marriage, na.rm = TRUE)
# Impute missing values in 'Age at First Marriage' with the median value
imputed_df$age_first_marriage[is.na(imputed_df$age_first_marriage)] <- median_age_first_marriage
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
# The mode is the value that appears most frequently in the data
uniqv <- unique(na.omit(v)) # Omit NA values and get unique values
uniqv[which.max(tabulate(match(v, uniqv)))] # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))
# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).
# Calculate the mode for each binary variable
mode_used_internet <- getMode(imputed_df$used_internet)
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_used_computer_tablet <- getMode(imputed_df$used_computer_tablet)
mode_owns_mobile_phone <- getMode(imputed_df$owns_mobile_phone)
mode_access_to_media <- getMode(imputed_df$access_to_media)
# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
used_internet = ifelse(is.na(used_internet), mode_used_internet, used_internet),
current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
used_computer_tablet = ifelse(is.na(used_computer_tablet), mode_used_computer_tablet, used_computer_tablet),
owns_mobile_phone = ifelse(is.na(owns_mobile_phone), mode_owns_mobile_phone, owns_mobile_phone),
access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)
# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))
# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.76 Mean :1.408 Mean :1.684 Mean :3.403
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :3.000 Max. :2.000 Max. :6.000
## education_level health_insurance ethnicity wealth_index
## Min. :0.000 Min. :0.0000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.00
## Median :2.000 Median :1.0000 Median :1.000 Median :2.00
## Mean :2.438 Mean :0.8721 Mean :1.527 Mean :2.54
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.00
## Max. :5.000 Max. :1.0000 Max. :4.000 Max. :5.00
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.6168 Mean :0.8072 Mean :0.3251
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## used_internet owns_mobile_phone access_to_media
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7192 Mean :0.8886 Mean :0.9071
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
## age_first_marriage marital_status area
## 0 0 0
## region education_level health_insurance
## 0 0 0
## ethnicity wealth_index current_contraceptive_use
## 0 0 0
## awareness_hiv_aids used_computer_tablet used_internet
## 0 0 0
## owns_mobile_phone access_to_media
## 0 0
# Histogram for 'age_first_marriage' before imputation
afm_0 <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
theme_light() +
ggtitle("Before Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Histogram for 'age_first_marriage' after imputation
afm_imputed <- ggplot(imputed_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "#E83853", color = "black", bins = 30) +
theme_light() +
ggtitle("After Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Arrange the two plots side by side
grid.arrange(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Arrange the two plots side by side and capture the layout as a grob
combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)
# Aggregate the data by region to get the total number of child marriages under 18 per region
region_counts <- aggregate(child_marriage ~ region, data = imputed_df, FUN = sum)
# Calculate the total number of child marriages under 18 in the dataset
total_child_marriages <- sum(region_counts$child_marriage)
# Calculate the percentage for each region
region_counts$married_u18_perc_of_total <- (region_counts$child_marriage / total_child_marriages) * 100
# Mapping region numbers to names
region_names <- c("Red River Delta", "Northern Midlands And Mountain",
"North Central And Central Coastal", "Central Highlands",
"South East", "Mekong River Delta")
names(region_counts)[1] <- "region_name"
region_counts$region_name <- factor(region_counts$region_name, levels = 1:6, labels = region_names)
# Display the final data frame
print(region_counts)
## region_name child_marriage married_u18_perc_of_total
## 1 Red River Delta 142 7.226463
## 2 Northern Midlands And Mountain 888 45.190840
## 3 North Central And Central Coastal 215 10.941476
## 4 Central Highlands 271 13.791349
## 5 South East 194 9.872774
## 6 Mekong River Delta 255 12.977099
# Updated mapping including all provinces and cities in the Red River Delta
province_to_region <- data.frame(
NAME_1 = c(
'Bắc Ninh', 'Hà Nam', 'Hà Nội', 'Hải Dương', 'Hải Phòng', 'Hoà Bình', 'Hưng Yên', 'Nam Định', 'Ninh Bình', 'Thái Bình', 'Vĩnh Phúc', # Red River Delta 11
'Bắc Giang', 'Bắc Kạn', 'Cao Bằng', 'Hà Giang', 'Lạng Sơn', 'Lào Cai', 'Phú Thọ', 'Quảng Ninh', 'Thái Nguyên', 'Tuyên Quang', 'Yên Bái', 'Điện Biên', 'Lai Châu', 'Sơn La', # Northern Midlands And Mountain 14
'Bình Định', 'Bình Thuận', 'Khánh Hòa', 'Ninh Thuận', 'Phú Yên', 'Quảng Nam', 'Quảng Ngãi', 'Thừa Thiên Huế', 'Đà Nẵng', 'Hà Tĩnh', 'Nghệ An', 'Quảng Bình', 'Quảng Trị', 'Thanh Hóa', # North Central And Central Coastal 14
'Đắk Lắk', 'Đắk Nông', 'Gia Lai', 'Kon Tum', 'Lâm Đồng', # Central Highlands 5
'Bà Rịa - Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Đồng Nai', 'Hồ Chí Minh', 'Tây Ninh', # South East 6
'An Giang', 'Bạc Liêu', 'Bến Tre', 'Cà Mau', 'Cần Thơ', 'Đồng Tháp', 'Hậu Giang', 'Kiên Giang', 'Long An', 'Sóc Trăng', 'Tiền Giang', 'Trà Vinh', 'Vĩnh Long' # Mekong River Delta 13
),
Region = c(
rep('Red River Delta', 11),
rep('Northern Midlands And Mountain', 14),
rep('North Central And Central Coastal', 14),
rep('Central Highlands', 5),
rep('South East', 6),
rep('Mekong River Delta', 13)
)
)
# Check the mapping
#print(province_to_region)
# Read shapefile
vietnam_shape <- st_read('/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp')
## Multiple layers are present in data source /Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp, reading layer `gadm41_VNM_2'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Reading layer `gadm41_VNM_2' from data source
## `/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 710 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 102.1446 ymin: 8.381355 xmax: 109.4692 ymax: 23.39269
## Geodetic CRS: WGS 84
# Join the shapefile with the province-to-region mapping
vietnam_shape_with_region <- vietnam_shape %>%
left_join(province_to_region, by = "NAME_1")
# Aggregate the shapefile data by region
vietnam_regions <- vietnam_shape_with_region %>%
group_by(Region) %>%
summarise(geometry = st_union(geometry), .groups = 'drop')
# Join the aggregated shapefile data with the child marriage data
vietnam_map_data <- vietnam_regions %>%
left_join(region_counts, by = c("Region" = "region_name"))
# Define colors with your specific choices
colors_ordered <- setNames(c("#B20033", "#CD0A25", "#E83853", "#EF7D8D", "#F7C0C8", "#FBE1E5"),
c("Northern Midlands And Mountain", "Central Highlands",
"Mekong River Delta", "North Central And Central Coastal",
"South East", "Red River Delta"))
# Plot
mapvn <- ggplot(data = vietnam_map_data) +
geom_sf(aes(fill = factor(Region, levels = names(colors_ordered))), color = NA) +
geom_sf_text(aes(label = sprintf("%.1f%%", married_u18_perc_of_total)), size = 4, hjust = 0.5, vjust = 0.5) +
scale_fill_manual(values = colors_ordered, name = "Region") +
labs(title = "Regional Contribution of Female Child Marriage Rates to Total Rates in Vietnam") +
theme_void() +
theme(legend.position = "right")
print(mapvn)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
#ggsave("mapvn.png", plot = mapvn, width = 8, height = 6, dpi = 300)
# Calculate total observations
total_obs <- nrow(imputed_df)
# Aggregate counts for each category by region without altering the original 'region' field
counts_df <- aggregate(cbind(ever_married = imputed_df$marital_status %in% c(1, 2),
married_u18 = imputed_df$child_marriage == 1,
married_u16 = imputed_df$child_marriage_u16 == 1) ~ region,
data = imputed_df,
FUN = sum)
# Convert counts to percentages
counts_df$ever_married <- (counts_df$ever_married / total_obs) * 100
counts_df$married_u18 <- (counts_df$married_u18 / total_obs) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_obs) * 100
p <- ggplot(counts_df, aes(x = factor(region))) +
geom_bar(aes(y = ever_married, fill = "Ever married"), stat = "identity") +
geom_bar(aes(y = married_u18, fill = "Married before 18 years old"), stat = "identity") +
geom_bar(aes(y = married_u16, fill = "Married before 16 years old"), stat = "identity") +
# Adding text labels for ever_married
geom_text(aes(y = ever_married, label = sprintf("%.1f%%", ever_married)),
position = position_stack(vjust = 1.025),
size = 4, color = "black") +
# Adding text labels for married_u18
geom_text(aes(y = married_u18, label = sprintf("%.1f%%", married_u18)),
position = position_stack(vjust = 1.05),
size = 4, color = "black") +
# Adding text labels for married_u16
geom_text(aes(y = married_u16, label = sprintf("%.1f%%", married_u16)),
position = position_stack(vjust = 1.1),
size = 4, color = "black") +
scale_fill_manual(values = c("Ever married" = "#FBE1E5",
"Married before 18 years old" = "#EF7D8D",
"Married before 16 years old" = "#B20016"),
name = "Marital Status") +
labs(x = "Region", y = "Percentage", title = "Marital Status Among Female Respondants by Region") +
theme_minimal() +
theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right") +
scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain",
"3" = "North Central And Central Coastal", "4" = "Central Highlands",
"5" = "South East", "6" = "Mekong River Delta"))
# Convert ggplot object to plotly for interactive visualization
ggplotly(p)
# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")
# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradientn(
colours = c("deepskyblue", "white", "#CD0A25"),
values = scales::rescale(c(-1, 0, 1)),
limits = c(-1, 1),
name="Pearson\nCorrelation"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("") +
ylab("") +
ggtitle("Correlation Matrix")
# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)
# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)
# Binary variables are already in the correct format and can be used as is
# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)
# Summarize the baseline model
summary(baseline_model)
##
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index +
## health_insurance + current_contraceptive_use + awareness_hiv_aids +
## access_to_media, family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.73391 0.13169 -5.573 2.50e-08 ***
## area2 0.45543 0.07999 5.693 1.25e-08 ***
## education_level1 -0.41350 0.08705 -4.750 2.03e-06 ***
## education_level2 -0.58693 0.08518 -6.890 5.57e-12 ***
## education_level3 -1.52807 0.11502 -13.286 < 2e-16 ***
## education_level4 -3.24812 0.51198 -6.344 2.24e-10 ***
## education_level5 -3.19482 0.25323 -12.616 < 2e-16 ***
## wealth_index2 -0.46183 0.07994 -5.777 7.61e-09 ***
## wealth_index3 -0.80552 0.09964 -8.085 6.24e-16 ***
## wealth_index4 -0.61389 0.10926 -5.618 1.93e-08 ***
## wealth_index5 -0.89341 0.14898 -5.997 2.01e-09 ***
## health_insurance 0.06907 0.08094 0.853 0.393
## current_contraceptive_use 0.40123 0.06065 6.615 3.71e-11 ***
## awareness_hiv_aids -0.47875 0.06960 -6.879 6.05e-12 ***
## access_to_media -0.01461 0.08121 -0.180 0.857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8493.7 on 11279 degrees of freedom
## AIC: 8523.7
##
## Number of Fisher Scoring iterations: 7
# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
base_vif_results <- vif(baseline_model)
print(base_vif_results)
## GVIF Df GVIF^(1/(2*Df))
## area 1.110732 1 1.053913
## education_level 1.707863 5 1.054983
## wealth_index 1.401192 4 1.043067
## health_insurance 1.021120 1 1.010505
## current_contraceptive_use 1.034853 1 1.017277
## awareness_hiv_aids 1.500511 1 1.224954
## access_to_media 1.268704 1 1.126367
# Check if any VIF value is greater than a typical threshold, like 5 or 10.
base_high_vif <- base_vif_results[base_vif_results > 5]
print(base_high_vif)
## numeric(0)
# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media,
family = binomial(),
data = imputed_df)
# Summarize the new model with FEs
summary(enhanced_model)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media, family = binomial(),
## data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.54522 0.17906 -8.630 < 2e-16 ***
## area2 0.28689 0.08488 3.380 0.000725 ***
## region2 0.37758 0.12919 2.923 0.003469 **
## region3 0.18170 0.12745 1.426 0.153967
## region4 0.34057 0.12599 2.703 0.006869 **
## region5 -0.05340 0.12612 -0.423 0.672028
## region6 -0.05277 0.13336 -0.396 0.692301
## education_level1 -0.05925 0.09353 -0.634 0.526382
## education_level2 -0.22571 0.09227 -2.446 0.014438 *
## education_level3 -1.17196 0.12172 -9.629 < 2e-16 ***
## education_level4 -2.96956 0.51385 -5.779 7.51e-09 ***
## education_level5 -2.89574 0.25599 -11.312 < 2e-16 ***
## ethnicity2 0.07771 0.10592 0.734 0.463184
## ethnicity3 0.27807 0.12830 2.167 0.030210 *
## ethnicity4 1.15504 0.10626 10.870 < 2e-16 ***
## wealth_index2 -0.12498 0.08523 -1.466 0.142533
## wealth_index3 -0.46330 0.10565 -4.385 1.16e-05 ***
## wealth_index4 -0.26442 0.11695 -2.261 0.023760 *
## wealth_index5 -0.54064 0.16030 -3.373 0.000744 ***
## health_insurance -0.05157 0.08277 -0.623 0.533244
## current_contraceptive_use 0.48480 0.06265 7.739 1.00e-14 ***
## awareness_hiv_aids -0.19387 0.07592 -2.554 0.010661 *
## access_to_media -0.07880 0.08517 -0.925 0.354845
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8240.2 on 11271 degrees of freedom
## AIC: 8286.2
##
## Number of Fisher Scoring iterations: 7
# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
FE_vif_results <- vif(enhanced_model)
print(FE_vif_results)
## GVIF Df GVIF^(1/(2*Df))
## area 1.254248 1 1.119932
## region 4.916245 5 1.172636
## education_level 1.956030 5 1.069394
## ethnicity 4.235677 3 1.272000
## wealth_index 1.934925 4 1.086008
## health_insurance 1.051524 1 1.025438
## current_contraceptive_use 1.049061 1 1.024237
## awareness_hiv_aids 1.679129 1 1.295812
## access_to_media 1.312005 1 1.145428
# Check if any VIF value is greater than a typical threshold, like 5.
FE_high_vif <- FE_vif_results[FE_vif_results > 5]
print(FE_high_vif)
## numeric(0)
No VIF is significant larger than 5. This suggests that the independent variables in my model with Fixed Effects do not suffer from severe multicollinearity.
# Model comparison using AIC
aic_base <- AIC(baseline_model)
aic_enhanced <- AIC(enhanced_model)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 8523.675
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 8286.168
Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.
# Model comparison using BIC
bic_base <- BIC(baseline_model)
bic_enhanced <- BIC(enhanced_model)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 8633.655
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 8454.805
Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.
# Adding the interaction term between education level and wealth index
model_interaction <- update(enhanced_model, . ~ . + area:wealth_index)
summary(model_interaction)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:wealth_index,
## family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.78525 0.21262 -8.396 < 2e-16 ***
## area2 0.54558 0.15257 3.576 0.000349 ***
## region2 0.36523 0.13009 2.807 0.004994 **
## region3 0.16778 0.12857 1.305 0.191877
## region4 0.32337 0.12701 2.546 0.010892 *
## region5 -0.06803 0.12726 -0.535 0.592956
## region6 -0.06059 0.13410 -0.452 0.651371
## education_level1 -0.06117 0.09352 -0.654 0.513085
## education_level2 -0.21895 0.09231 -2.372 0.017701 *
## education_level3 -1.17282 0.12176 -9.632 < 2e-16 ***
## education_level4 -2.96880 0.51416 -5.774 7.74e-09 ***
## education_level5 -2.89550 0.25820 -11.214 < 2e-16 ***
## ethnicity2 0.06171 0.10624 0.581 0.561352
## ethnicity3 0.27262 0.12824 2.126 0.033518 *
## ethnicity4 1.12852 0.10687 10.559 < 2e-16 ***
## wealth_index2 0.26715 0.19445 1.374 0.169473
## wealth_index3 -0.15465 0.21122 -0.732 0.464075
## wealth_index4 -0.04152 0.22195 -0.187 0.851622
## wealth_index5 -0.32837 0.26777 -1.226 0.220078
## health_insurance -0.04238 0.08289 -0.511 0.609164
## current_contraceptive_use 0.49215 0.06274 7.844 4.35e-15 ***
## awareness_hiv_aids -0.18696 0.07599 -2.460 0.013883 *
## access_to_media -0.07199 0.08522 -0.845 0.398196
## area2:wealth_index2 -0.47937 0.21523 -2.227 0.025928 *
## area2:wealth_index3 -0.38304 0.24189 -1.584 0.113299
## area2:wealth_index4 -0.26224 0.25631 -1.023 0.306247
## area2:wealth_index5 -0.25049 0.31993 -0.783 0.433646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8234.7 on 11267 degrees of freedom
## AIC: 8288.7
##
## Number of Fisher Scoring iterations: 7
cat("AIC (enhanced):", AIC(enhanced_model), "\nAIC (w/ interaction terms):", AIC(model_interaction), "\n")
## AIC (enhanced): 8286.168
## AIC (w/ interaction terms): 8288.662
cat("BIC (enhanced):", BIC(enhanced_model), "\nBIC (w/ interaction terms):", BIC(model_interaction), "\n")
## BIC (enhanced): 8454.805
## BIC (w/ interaction terms): 8486.627
enhanced_model and model_interaction –
indicate that the simpler model (without interaction terms) has a better
trade-off between model fit and complexity.model_interaction. This suggests that, despite possibly
having a good fit, the addition of interaction terms to the
model_interaction does not improve the model’s performance
enough to justify the added complexity. A lower AIC value is generally
preferred, implying that the enhanced_model is a better
model choice from an information criterion standpoint.enhanced_model. BIC, which includes a stricter penalty for
the number of parameters, reinforces the conclusion drawn from the AIC:
the additional complexity introduced by interaction terms does not lead
to a proportional improvement in the model fit. BIC being more
stringent, particularly in models with larger sample sizes and more
parameters, suggests a strong preference for the simpler
enhanced_model.enhanced_model as a more optimal balance between goodness
of fit and parsimony. This indicates that, while interaction terms add
detail and complexity to the model, they might not be contributing
significantly to explaining the variability in your dependent
variable.# Function to add significance asterisks
add_asterisks <- function(p_value) {
if (is.na(p_value)) {
return(NA)
} else if (p_value < 0.001) {
return("***")
} else if (p_value < 0.01) {
return("**")
} else if (p_value < 0.05) {
return("*")
} else {
return("")
}
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_enhanced <- tidy_enhanced %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_interaction <- tidy_interaction %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Baseline")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Enhanced")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Interaction")
# Combine and pivot the dataframes
combined_results <- bind_rows(
tidy_baseline %>% select(term, OR, CI, Model),
tidy_enhanced %>% select(term, OR, CI, Model),
tidy_interaction %>% select(term, OR, CI, Model)
) %>%
pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
select(term,
OR_Baseline, CI_Baseline,
OR_Enhanced, CI_Enhanced,
OR_Interaction, CI_Interaction)
# Print the final combined table
print(combined_results)
## # A tibble: 27 × 7
## term OR_Baseline CI_Baseline OR_Enhanced CI_Enhanced OR_Interaction
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 (Intercept) 0.48*** (0.37, 0.6… 0.21*** (0.15, 0.3) 0.17***
## 2 area2 1.58*** (1.35, 1.8… 1.33*** (1.13, 1.5… 1.73***
## 3 education_lev… 0.66*** (0.56, 0.7… 0.94 (0.78, 1.1… 0.94
## 4 education_lev… 0.56*** (0.47, 0.6… 0.8* (0.67, 0.9… 0.8*
## 5 education_lev… 0.22*** (0.17, 0.2… 0.31*** (0.24, 0.3… 0.31***
## 6 education_lev… 0.04*** (0.01, 0.0… 0.05*** (0.02, 0.1… 0.05***
## 7 education_lev… 0.04*** (0.02, 0.0… 0.06*** (0.03, 0.0… 0.06***
## 8 wealth_index2 0.63*** (0.54, 0.7… 0.88 (0.75, 1.0… 1.31
## 9 wealth_index3 0.45*** (0.37, 0.5… 0.63*** (0.51, 0.7… 0.86
## 10 wealth_index4 0.54*** (0.44, 0.6… 0.77* (0.61, 0.9… 0.96
## # ℹ 17 more rows
## # ℹ 1 more variable: CI_Interaction <chr>
# 1. Hosmer-Lemeshow Test for the Baseline Model
hoslem.test(baseline_model$y, fitted(baseline_model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: baseline_model$y, fitted(baseline_model)
## X-squared = 10.565, df = 8, p-value = 0.2276
# 2. Hosmer-Lemeshow Test for the Enhanced Model (Baseline + Fixed Effects)
hoslem.test(enhanced_model$y, fitted(enhanced_model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: enhanced_model$y, fitted(enhanced_model)
## X-squared = 10.538, df = 8, p-value = 0.2293
# 3. Hosmer-Lemeshow Test for the Interaction Model (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(model_interaction$y, fitted(model_interaction), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: model_interaction$y, fitted(model_interaction)
## X-squared = 10.272, df = 8, p-value = 0.2465
A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.
# Baseline vs. Baseline + Fixed Effects
anova(baseline_model, enhanced_model, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance +
## current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 11279 8493.7
## 2 11271 8240.2 8 253.51 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Enhanced Model provides a significantly better fit to the data compared to the Baseline Model, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).
# Baseline + Fixed Effects vs. Baseline + Fixed Effects + Interaction Terms
anova(enhanced_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:wealth_index
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 11271 8240.2
## 2 11267 8234.7 4 5.506 0.2392
Adding the interaction terms between area and wealth_index does not significantly improve the model fit compared to the Enhanced Model without interaction terms. This is indicated by the relatively high p-value and the smaller decrease in residual deviance.
# For each model
roc_response_baseline <- roc(imputed_df$child_marriage, fitted(baseline_model))
auc_baseline <- auc(roc_response_baseline)
roc_response_enhanced <- roc(imputed_df$child_marriage, fitted(enhanced_model))
auc_enhanced <- auc(roc_response_enhanced)
roc_response_interaction <- roc(imputed_df$child_marriage, fitted(model_interaction))
auc_interaction <- auc(roc_response_interaction)
# Compare AUC values
print(paste("AUC Baseline Model:", auc_baseline))
## [1] "AUC Baseline Model: 0.794541604239919"
print(paste("AUC Enhanced Model:", auc_enhanced))
## [1] "AUC Enhanced Model: 0.809330968003957"
print(paste("AUC Interaction Model:", auc_interaction))
## [1] "AUC Interaction Model: 0.80987822317723"
The AUC value is close to 0.8, which indicates that the Baseline Model has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy. An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.
This model shows a slight improvement in AUC over the Baseline Model. The increase suggests that the additional variables (or adjustments) you made in the Enhanced Model contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Baseline and Enhanced models, while modest, is still meaningful, especially in practical, real-world contexts.
The Interaction Model shows a very slight improvement in AUC over the Enhanced Model. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.
All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are around or above 0.8. The Enhanced and Interaction Models only show marginal improvements in AUC compared to the Baseline Model. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial. Given the slight increases in AUC with added model complexity, consider the trade-offs. A simpler model might be preferable if it is easier to interpret and communicate, especially if the increase in predictive power is minimal. Personally, given this result, I think Enhanced Model might be a better-suited model overall.
# No imputation -- Remove rows with any missing data from 'female_df'
og_df <- na.omit(female_df)
head(og_df)
## # A tibble: 6 × 14
## age_first_marriage marital_status area region education_level
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17 1 1 1 2
## 2 22 1 1 1 5
## 3 26 1 1 1 5
## 4 21 1 1 1 3
## 5 18 1 1 1 1
## 6 22 1 1 1 4
## # ℹ 9 more variables: health_insurance <dbl>, ethnicity <dbl>,
## # wealth_index <dbl>, current_contraceptive_use <dbl>,
## # awareness_hiv_aids <dbl>, used_computer_tablet <dbl>, used_internet <dbl>,
## # owns_mobile_phone <dbl>, access_to_media <dbl>
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
og_df <- og_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
og_df <- og_df %>%
mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
og_df <- og_df %>%
select(child_marriage, child_marriage_u16, everything())
# Convert nominal and ordinal variables to factors
og_df$area <- as.factor(og_df$area)
og_df$region <- as.factor(og_df$region)
og_df$education_level <- factor(og_df$education_level, ordered = FALSE)
og_df$ethnicity <- as.factor(og_df$ethnicity)
og_df$wealth_index <- factor(og_df$wealth_index, ordered = FALSE)
# Binary variables are already in the correct format and can be used as is
# Base model
base_model_no_impute <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)
# Summarize the base model
summary(base_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index +
## health_insurance + current_contraceptive_use + awareness_hiv_aids +
## access_to_media, family = binomial(), data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12852 0.15839 -0.811 0.41714
## area2 0.27195 0.09515 2.858 0.00426 **
## education_level1 -0.44184 0.10273 -4.301 1.70e-05 ***
## education_level2 -0.51162 0.10336 -4.950 7.43e-07 ***
## education_level3 -1.27268 0.13835 -9.199 < 2e-16 ***
## education_level4 -3.39394 0.71961 -4.716 2.40e-06 ***
## education_level5 -3.17824 0.46414 -6.848 7.51e-12 ***
## wealth_index2 -0.61389 0.09307 -6.596 4.22e-11 ***
## wealth_index3 -0.92891 0.11199 -8.295 < 2e-16 ***
## wealth_index4 -0.75155 0.12357 -6.082 1.19e-09 ***
## wealth_index5 -1.02338 0.16941 -6.041 1.53e-09 ***
## health_insurance 0.10983 0.09266 1.185 0.23587
## current_contraceptive_use -0.10949 0.07039 -1.555 0.11984
## awareness_hiv_aids -0.46608 0.08253 -5.647 1.63e-08 ***
## access_to_media 0.01062 0.10367 0.102 0.91843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6819.3 on 6443 degrees of freedom
## Residual deviance: 5889.2 on 6429 degrees of freedom
## AIC: 5919.2
##
## Number of Fisher Scoring iterations: 7
# Enhanced model with additional fixed effects
enhanced_model_no_impute <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)
# Summarize the new model with FEs
summary(enhanced_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media, family = binomial(),
## data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.70949 0.23279 -7.343 2.08e-13 ***
## area2 0.17214 0.10044 1.714 0.08654 .
## region2 0.10548 0.16065 0.657 0.51143
## region3 0.03594 0.15601 0.230 0.81778
## region4 -0.10432 0.17061 -0.611 0.54090
## region5 0.06092 0.14434 0.422 0.67298
## region6 0.10315 0.14996 0.688 0.49156
## education_level1 0.12519 0.11676 1.072 0.28362
## education_level2 0.07838 0.11878 0.660 0.50933
## education_level3 -0.71860 0.15086 -4.763 1.91e-06 ***
## education_level4 -2.82957 0.72295 -3.914 9.08e-05 ***
## education_level5 -2.61912 0.46855 -5.590 2.27e-08 ***
## ethnicity2 0.55070 0.13875 3.969 7.21e-05 ***
## ethnicity3 0.33886 0.14008 2.419 0.01556 *
## ethnicity4 1.90113 0.16173 11.755 < 2e-16 ***
## wealth_index2 -0.05571 0.10820 -0.515 0.60665
## wealth_index3 -0.34294 0.12995 -2.639 0.00831 **
## wealth_index4 -0.14989 0.14419 -1.040 0.29857
## wealth_index5 -0.41552 0.19232 -2.161 0.03073 *
## health_insurance -0.07933 0.09623 -0.824 0.40971
## current_contraceptive_use -0.02792 0.07341 -0.380 0.70370
## awareness_hiv_aids -0.08636 0.09460 -0.913 0.36132
## access_to_media 0.17785 0.11026 1.613 0.10674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6819.3 on 6443 degrees of freedom
## Residual deviance: 5658.7 on 6421 degrees of freedom
## AIC: 5704.7
##
## Number of Fisher Scoring iterations: 7
# Model comparison using AIC
aic_base <- AIC(base_model_no_impute)
aic_enhanced <- AIC(enhanced_model_no_impute)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 5919.176
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 5704.682
Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.
# Model comparison using BIC
bic_base <- BIC(base_model_no_impute)
bic_enhanced <- BIC(enhanced_model_no_impute)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 6020.739
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 5860.413
Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.
# Adding the interaction term between education level and wealth index
interaction_model_no_impute <- update(enhanced_model_no_impute, . ~ . + area:wealth_index)
summary(interaction_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:wealth_index,
## family = binomial(), data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.71245 0.30284 -5.655 1.56e-08 ***
## area2 0.15095 0.24236 0.623 0.53341
## region2 0.12616 0.16152 0.781 0.43477
## region3 0.05816 0.15716 0.370 0.71134
## region4 -0.08816 0.17115 -0.515 0.60647
## region5 0.08191 0.14672 0.558 0.57664
## region6 0.12494 0.15100 0.827 0.40799
## education_level1 0.12367 0.11674 1.059 0.28943
## education_level2 0.07710 0.11879 0.649 0.51630
## education_level3 -0.71184 0.15091 -4.717 2.39e-06 ***
## education_level4 -2.79700 0.72341 -3.866 0.00011 ***
## education_level5 -2.58385 0.46919 -5.507 3.65e-08 ***
## ethnicity2 0.55095 0.13897 3.965 7.35e-05 ***
## ethnicity3 0.33370 0.14008 2.382 0.01721 *
## ethnicity4 1.90109 0.16307 11.658 < 2e-16 ***
## wealth_index2 0.02709 0.27554 0.098 0.92168
## wealth_index3 -0.35431 0.28892 -1.226 0.22007
## wealth_index4 -0.24012 0.30088 -0.798 0.42484
## wealth_index5 -0.66376 0.37279 -1.781 0.07499 .
## health_insurance -0.07528 0.09636 -0.781 0.43465
## current_contraceptive_use -0.02755 0.07345 -0.375 0.70764
## awareness_hiv_aids -0.08558 0.09457 -0.905 0.36551
## access_to_media 0.17709 0.11027 1.606 0.10827
## area2:wealth_index2 -0.11570 0.29382 -0.394 0.69374
## area2:wealth_index3 0.01145 0.31297 0.037 0.97081
## area2:wealth_index4 0.13487 0.32848 0.411 0.68137
## area2:wealth_index5 0.37698 0.41566 0.907 0.36443
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6819.3 on 6443 degrees of freedom
## Residual deviance: 5656.6 on 6417 degrees of freedom
## AIC: 5710.6
##
## Number of Fisher Scoring iterations: 7
cat("AIC (enhanced):", AIC(enhanced_model_no_impute), "\nAIC (w/ interaction terms):", AIC(interaction_model_no_impute), "\n")
## AIC (enhanced): 5704.682
## AIC (w/ interaction terms): 5710.578
cat("BIC (enhanced):", BIC(enhanced_model_no_impute), "\nBIC (w/ interaction terms):", BIC(interaction_model_no_impute), "\n")
## BIC (enhanced): 5860.413
## BIC (w/ interaction terms): 5893.392
enhanced_model_no_impute and
interaction_model_no_impute – indicate that the simpler
model (without interaction terms) has a better trade-off between model
fit and complexity.interaction_model_no_impute. This suggests that, despite
possibly having a good fit, the addition of interaction terms to the
interaction_model_no_impute does not improve the model’s
performance enough to justify the added complexity. A lower AIC value is
generally preferred, implying that the
enhanced_model_no_impute is a better model choice from an
information criterion standpoint.enhanced_model_no_impute. BIC, which includes a stricter
penalty for the number of parameters, reinforces the conclusion drawn
from the AIC: the additional complexity introduced by interaction terms
does not lead to a proportional improvement in the model fit. BIC being
more stringent, particularly in models with larger sample sizes and more
parameters, suggests a strong preference for the simpler
enhanced_model_no_impute.enhanced_model_no_impute as a more optimal balance between
goodness of fit and parsimony. This indicates that, while interaction
terms add detail and complexity to the model, they might not be
contributing significantly to explaining the variability in your
dependent variable.# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
base_no_impute_vif <- vif(base_model_no_impute)
print(base_no_impute_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.161113 1 1.077549
## education_level 1.635562 5 1.050429
## wealth_index 1.521114 4 1.053829
## health_insurance 1.028226 1 1.014015
## current_contraceptive_use 1.009489 1 1.004733
## awareness_hiv_aids 1.471075 1 1.212879
## access_to_media 1.212302 1 1.101046
# Enhanced model
enhanced_no_impute_vif <- vif(enhanced_model_no_impute)
print(enhanced_no_impute_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.295560 1 1.138227
## region 5.816954 5 1.192531
## education_level 2.116453 5 1.077856
## ethnicity 7.824091 3 1.408983
## wealth_index 2.687468 4 1.131535
## health_insurance 1.087063 1 1.042623
## current_contraceptive_use 1.025473 1 1.012656
## awareness_hiv_aids 1.772812 1 1.331470
## access_to_media 1.256323 1 1.120858
# Interacton model
interaction_no_impute_vif <- vif(interaction_model_no_impute)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(interaction_no_impute_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 7.510342 1 2.740500
## region 6.056756 5 1.197358
## education_level 2.125445 5 1.078313
## ethnicity 7.978570 3 1.413581
## wealth_index 497.956732 4 2.173446
## health_insurance 1.090010 1 1.044035
## current_contraceptive_use 1.026554 1 1.013190
## awareness_hiv_aids 1.771947 1 1.331145
## access_to_media 1.256367 1 1.120878
## area:wealth_index 466.507434 4 2.155794
wealth_index and
area:wealth_index, their VIF values are significantly
higher (around 2.17 and 2.16, respectively). These values suggest a
considerable level of multicollinearity, which is often expected when
including interaction terms in a model.# Extracting data from the models
extract_model_data <- function(model) {
model_summary <- summary(model)
coeffs <- model_summary$coefficients
data.frame(
Term = rownames(coeffs),
Estimate = sprintf("%.3f", coeffs[, "Estimate"]),
pValue = ifelse(coeffs[, "Pr(>|z|)"] < 0.001,
format(coeffs[, "Pr(>|z|)"], scientific = TRUE),
sprintf("%.3f", coeffs[, "Pr(>|z|)"])),
Significance = sapply(coeffs[, "Pr(>|z|)"], add_asterisks)
)
}
# Applying the function to each model
base_impute_data <- extract_model_data(baseline_model)
base_no_impute_data <- extract_model_data(base_model_no_impute)
# Combine the data for comparison
base_comparison_data <- merge(base_impute_data, base_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(base_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -0.734 2.502722e-08 ***
## 2 area2 0.455 1.245520e-08 ***
## 3 education_level1 -0.413 2.032253e-06 ***
## 4 education_level2 -0.587 5.570356e-12 ***
## 5 education_level3 -1.528 2.806068e-40 ***
## 6 education_level4 -3.248 2.235120e-10 ***
## 7 education_level5 -3.195 1.720431e-36 ***
## 8 wealth_index2 -0.462 7.607170e-09 ***
## 9 wealth_index3 -0.806 6.235617e-16 ***
## 10 wealth_index4 -0.614 1.926274e-08 ***
## 11 wealth_index5 -0.893 2.012553e-09 ***
## 12 health_insurance 0.069 0.393
## 13 current_contraceptive_use 0.401 3.710837e-11 ***
## 14 awareness_hiv_aids -0.479 6.047566e-12 ***
## 15 access_to_media -0.015 0.857
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -0.129 0.417
## 2 0.272 0.004 **
## 3 -0.442 1.701057e-05 ***
## 4 -0.512 7.430972e-07 ***
## 5 -1.273 3.606682e-20 ***
## 6 -3.394 2.400883e-06 ***
## 7 -3.178 7.512748e-12 ***
## 8 -0.614 4.224206e-11 ***
## 9 -0.929 1.089044e-16 ***
## 10 -0.752 1.186365e-09 ***
## 11 -1.023 1.532441e-09 ***
## 12 0.110 0.236
## 13 -0.109 0.120
## 14 -0.466 1.630404e-08 ***
## 15 0.011 0.918
# Applying the function to each model
enhanced_impute_data <- extract_model_data(enhanced_model)
enhanced_no_impute_data <- extract_model_data(enhanced_model_no_impute)
# Combine the data for comparison
enhanced_comparison_data <- merge(enhanced_impute_data, enhanced_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(enhanced_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -1.545 6.147348e-18 ***
## 2 area2 0.287 7.251287e-04 ***
## 3 region2 0.378 0.003 **
## 4 region3 0.182 0.154
## 5 region4 0.341 0.007 **
## 6 region5 -0.053 0.672
## 7 region6 -0.053 0.692
## 8 education_level1 -0.059 0.526
## 9 education_level2 -0.226 0.014 *
## 10 education_level3 -1.172 6.057140e-22 ***
## 11 education_level4 -2.970 7.510453e-09 ***
## 12 education_level5 -2.896 1.143797e-29 ***
## 13 ethnicity2 0.078 0.463
## 14 ethnicity3 0.278 0.030 *
## 15 ethnicity4 1.155 1.603993e-27 ***
## 16 wealth_index2 -0.125 0.143
## 17 wealth_index3 -0.463 1.158858e-05 ***
## 18 wealth_index4 -0.264 0.024 *
## 19 wealth_index5 -0.541 7.444127e-04 ***
## 20 health_insurance -0.052 0.533
## 21 current_contraceptive_use 0.485 1.004528e-14 ***
## 22 awareness_hiv_aids -0.194 0.011 *
## 23 access_to_media -0.079 0.355
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -1.709 2.081962e-13 ***
## 2 0.172 0.087
## 3 0.105 0.511
## 4 0.036 0.818
## 5 -0.104 0.541
## 6 0.061 0.673
## 7 0.103 0.492
## 8 0.125 0.284
## 9 0.078 0.509
## 10 -0.719 1.905736e-06 ***
## 11 -2.830 9.081803e-05 ***
## 12 -2.619 2.273156e-08 ***
## 13 0.551 7.214439e-05 ***
## 14 0.339 0.016 *
## 15 1.901 6.687210e-32 ***
## 16 -0.056 0.607
## 17 -0.343 0.008 **
## 18 -0.150 0.299
## 19 -0.416 0.031 *
## 20 -0.079 0.410
## 21 -0.028 0.704
## 22 -0.086 0.361
## 23 0.178 0.107
# Applying the function to each model
interaction_impute_data <- extract_model_data(model_interaction)
interaction_no_impute_data <- extract_model_data(interaction_model_no_impute)
# Combine the data for comparison
interaction_comparison_data <- merge(interaction_impute_data, interaction_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(interaction_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -1.785 4.599871e-17 ***
## 2 area2 0.546 3.488899e-04 ***
## 3 region2 0.365 0.005 **
## 4 region3 0.168 0.192
## 5 region4 0.323 0.011 *
## 6 region5 -0.068 0.593
## 7 region6 -0.061 0.651
## 8 education_level1 -0.061 0.513
## 9 education_level2 -0.219 0.018 *
## 10 education_level3 -1.173 5.833944e-22 ***
## 11 education_level4 -2.969 7.739192e-09 ***
## 12 education_level5 -2.896 3.473185e-29 ***
## 13 ethnicity2 0.062 0.561
## 14 ethnicity3 0.273 0.034 *
## 15 ethnicity4 1.129 4.598972e-26 ***
## 16 wealth_index2 0.267 0.169
## 17 wealth_index3 -0.155 0.464
## 18 wealth_index4 -0.042 0.852
## 19 wealth_index5 -0.328 0.220
## 20 health_insurance -0.042 0.609
## 21 current_contraceptive_use 0.492 4.348509e-15 ***
## 22 awareness_hiv_aids -0.187 0.014 *
## 23 access_to_media -0.072 0.398
## 24 area2:wealth_index2 -0.479 0.026 *
## 25 area2:wealth_index3 -0.383 0.113
## 26 area2:wealth_index4 -0.262 0.306
## 27 area2:wealth_index5 -0.250 0.434
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -1.712 1.561885e-08 ***
## 2 0.151 0.533
## 3 0.126 0.435
## 4 0.058 0.711
## 5 -0.088 0.606
## 6 0.082 0.577
## 7 0.125 0.408
## 8 0.124 0.289
## 9 0.077 0.516
## 10 -0.712 2.393561e-06 ***
## 11 -2.797 1.104547e-04 ***
## 12 -2.584 3.649218e-08 ***
## 13 0.551 7.354197e-05 ***
## 14 0.334 0.017 *
## 15 1.901 2.093819e-31 ***
## 16 0.027 0.922
## 17 -0.354 0.220
## 18 -0.240 0.425
## 19 -0.664 0.075
## 20 -0.075 0.435
## 21 -0.028 0.708
## 22 -0.086 0.366
## 23 0.177 0.108
## 24 -0.116 0.694
## 25 0.011 0.971
## 26 0.135 0.681
## 27 0.377 0.364